# Read the Mr. Boston cocktail recipes from the TidyTuesday repository.
boston_cocktails <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-26/boston_cocktails.csv")

# Which ingredients show up most often in the raw (uncleaned) data?
count(boston_cocktails, ingredient, sort = TRUE)
## # A tibble: 569 x 2
## ingredient n
## <chr> <int>
## 1 Gin 176
## 2 Fresh lemon juice 138
## 3 Simple Syrup 115
## 4 Vodka 114
## 5 Light Rum 113
## 6 Dry Vermouth 107
## 7 Fresh Lime Juice 107
## 8 Triple Sec 107
## 9 Powdered Sugar 90
## 10 Grenadine 85
## # … with 559 more rows
# Clean up the free-text ingredient names and parse the measure column
# into a usable numeric amount.
cocktails_parsed <- boston_cocktails %>%
  mutate(
    ingredient = str_to_lower(ingredient), # get lower case
    ingredient = str_replace_all(ingredient, "-", " "), # replace all dashes with spaces
    ingredient = str_remove(ingredient, " liqueur$"),
    # BUG FIX: parentheses are regex metacharacters, so the original pattern
    # " (if desired)$" never matched the literal text "(if desired)".
    # Escaping them makes the suffix actually get stripped.
    ingredient = str_remove(ingredient, " \\(if desired\\)$"),
    # Collapse ingredient variants (e.g. "fresh lemon juice") into a handful
    # of canonical names; first matching condition wins.
    ingredient = case_when(
      str_detect(ingredient, "bitters") ~ "bitters",
      str_detect(ingredient, "lemon") ~ "lemon juice",
      str_detect(ingredient, "lime") ~ "lime juice",
      str_detect(ingredient, "grapefruit") ~ "grapefruit juice",
      str_detect(ingredient, "orange") ~ "orange juice",
      TRUE ~ ingredient
    ),
    # Bitters are measured in dashes, not ounces
    measure = case_when(
      str_detect(ingredient, "bitters") ~ str_replace(measure, "oz$", "dash"),
      TRUE ~ measure
    ),
    measure = str_replace(measure, " ?1/2", ".5"), # change 1 1/2 oz to 1.5 oz
    measure = str_replace(measure, " ?3/4", ".75"),
    measure = str_replace(measure, " ?1/4", ".25"),
    measure_number = parse_number(measure), # get out the number only
    # Scale dashes down to an approximate ounce equivalent (1 dash ~ 1/50 oz)
    measure_number = if_else(str_detect(measure, "dash$"),
      measure_number / 50,
      measure_number
    )
  )
cocktails_parsed
## NOTE(review): output below was captured before the "(if desired)" regex fix;
## a handful of ingredient strings will now have that suffix stripped.
## # A tibble: 3,643 x 7
## name category row_id ingredient_numb… ingredient measure measure_number
## <chr> <chr> <dbl> <dbl> <chr> <chr> <dbl>
## 1 Gauguin Cocktail … 1 1 light rum 2 oz 2
## 2 Gauguin Cocktail … 1 2 passion fr… 1 oz 1
## 3 Gauguin Cocktail … 1 3 lemon juice 1 oz 1
## 4 Gauguin Cocktail … 1 4 lime juice 1 oz 1
## 5 Fort L… Cocktail … 2 1 light rum 1.5 oz 1.5
## 6 Fort L… Cocktail … 2 2 sweet verm… .5 oz 0.5
## 7 Fort L… Cocktail … 2 3 orange jui… .25 oz 0.25
## 8 Fort L… Cocktail … 2 4 lime juice .25 oz 0.25
## 9 Apple … Cordials … 3 1 apple schn… 3 oz 3
## 10 Apple … Cordials … 3 2 cinnamon s… 1 oz 1
## # … with 3,633 more rows
# After cleaning, how many distinct ingredients remain?
count(cocktails_parsed, ingredient, sort = TRUE)
## # A tibble: 440 x 2
## ingredient n
## <chr> <int>
## 1 lemon juice 311
## 2 lime juice 180
## 3 gin 176
## 4 bitters 166
## 5 orange juice 124
## 6 simple syrup 115
## 7 vodka 114
## 8 light rum 113
## 9 dry vermouth 107
## 10 triple sec 107
## # … with 430 more rows
We still have 440 ingredients, too many! Let’s filter for only ingredients that appear more than 15 times:
# Keep only common ingredients (those appearing more than 15 times),
# which trims the ingredient list from >400 down to about 40.
cocktails_parsed_filtered <- cocktails_parsed %>%
  group_by(ingredient) %>%
  filter(n() > 15) %>%
  ungroup() %>%
  # One row per (recipe, ingredient): the canonicalization above can turn
  # e.g. "lemon slice" and "lemon juice" into two identical rows.
  distinct(row_id, ingredient, .keep_all = TRUE) %>%
  na.omit()
What are the most common ingredients?
# Bar chart of the 20 most common ingredients after filtering.
cocktails_parsed_filtered %>%
  count(ingredient, sort = TRUE) %>%
  slice_head(n = 20) %>%
  mutate(ingredient = fct_reorder(ingredient, n)) %>% # order bars by frequency
  ggplot(aes(x = n, y = ingredient)) +
  geom_col() +
  labs(title = "Most common ingredients in Mr Boston recipes")
# The same top-20 ingredients as a flipped dot plot instead of bars.
cocktails_parsed_filtered %>%
  count(ingredient, sort = TRUE) %>%
  slice_head(n = 20) %>%
  mutate(ingredient = fct_reorder(ingredient, n)) %>%
  ggplot(aes(x = ingredient, y = n)) +
  geom_dotplot(binaxis = "y", stackdir = "center") +
  coord_flip() +
  labs(title = "Most common ingredients in Mr Boston recipes")
# How many recipes fall in each category? distinct() first so that each
# recipe is counted once, not once per ingredient row.
count(distinct(cocktails_parsed_filtered, name, category), category, sort = TRUE)
## # A tibble: 11 x 2
## category n
## <chr> <int>
## 1 Cocktail Classics 428
## 2 Vodka 137
## 3 Rum - Daiquiris 110
## 4 Whiskies 104
## 5 Tequila 85
## 6 Brandy 46
## 7 Gin 17
## 8 Cordials and Liqueurs 6
## 9 Shooters 2
## 10 Non-alcoholic Drinks 1
## 11 Rum 1
# Share of recipes that use each of the top ingredients, filled by the
# recipe category.
n_recipes <- n_distinct(cocktails_parsed_filtered$name)
cocktails_parsed_filtered %>%
  count(category, ingredient, sort = TRUE) %>%
  mutate(
    category = fct_lump(category, 4), # keep the 4 biggest categories, lump the rest
    ingredient = fct_lump(ingredient, 20) # keep the 20 biggest ingredients
  ) %>%
  filter(ingredient != "Other") %>%
  mutate(ingredient = fct_reorder(ingredient, n, sum)) %>% # order by total count
  ggplot(aes(x = n / n_recipes, y = ingredient, fill = category)) +
  geom_col() +
  scale_x_continuous(labels = percent_format()) +
  labs(
    title = "Most common ingredients in Mr Boston recipes",
    x = "% of all recipes",
    y = "Ingredient",
    fill = "Category"
  )
Which ingredients tend to appear together in the same recipe?
library(widyr)
library(tidytext)
# Pairwise correlation of ingredients across recipes: which pairs of
# ingredients co-occur in the same drink more often than chance?
ingredient_pairs <- cocktails_parsed_filtered %>%
add_count(ingredient) %>% # adds a column n with each ingredient's row count (like group_by + count, but keeps all rows)
filter(n >= 5) %>% # keep only ingredients that appear in at least 5 rows
pairwise_cor(ingredient, name, sort = TRUE) # phi coefficient per ingredient pair, grouping by recipe name
ingredient_pairs
## # A tibble: 1,560 x 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 whole egg powdered sugar 0.360
## 2 powdered sugar whole egg 0.360
## 3 port powdered sugar 0.278
## 4 powdered sugar port 0.278
## 5 dry vermouth gin 0.274
## 6 gin dry vermouth 0.274
## 7 egg yolk powdered sugar 0.251
## 8 powdered sugar egg yolk 0.251
## 9 egg white powdered sugar 0.215
## 10 powdered sugar egg white 0.215
## # … with 1,550 more rows
Which ingredients are most closely correlated with gin?:
# The 10 ingredients most correlated with gin (pairs are pre-sorted by
# correlation, so the first 10 rows are the top 10).
ingredient_pairs %>%
  filter(item1 == "gin") %>%
  slice_head(n = 10) %>%
  mutate(item2 = fct_reorder(item2, correlation)) %>%
  ggplot(aes(x = correlation, y = item2)) +
  geom_col() +
  labs(title = "What ingredients are most correlated with gin?")
Note that dry vermouth is at the top - makes sense because dry vermouth + gin is a common base for martinis!
# Faceted version: the top correlated ingredients for four common bases.
ingredient_pairs %>%
  filter(item1 %in% c("gin", "vodka", "bourbon whiskey", "lemon juice")) %>%
  group_by(item1) %>%
  top_n(8, correlation) %>% # top 8 partners per base ingredient
  ungroup() %>% # must ungroup before reorder_within()/plotting
  # reorder_within() (tidytext) orders bars inside each facet; pair it with
  # scale_y_reordered() below to clean up the axis labels.
  mutate(item2 = reorder_within(item2, correlation, item1)) %>%
  ggplot(aes(correlation, item2)) +
  geom_col() +
  facet_wrap(~ item1, scales = "free_y") +
  scale_y_reordered() +
  # Fixed typo in the displayed title: "ingreidents" -> "ingredients"
  labs(title = "What ingredients are most correlated with particular ingredients?")
# Per-ingredient summary: frequency, how many rows have a numeric measure,
# average relative position within a recipe (0-1), and average serving size.
ingredients_summarized <- cocktails_parsed_filtered %>%
  group_by(name) %>%
  # Relative position, so long and short recipes are comparable
  mutate(percentile = row_number() / n()) %>%
  ungroup() %>%
  group_by(ingredient) %>%
  summarize(
    n = n(),
    n_with_measure = sum(!is.na(measure_number)),
    avg_position = mean(percentile),
    avg_serving = mean(measure_number, na.rm = TRUE)
  ) %>%
  arrange(desc(n))
# ingredients_summarized
library(ggraph)
library(igraph)
library(ggrepel)
# Take the 70 strongest pairwise correlations as the graph's edges
top_cors <- ingredient_pairs %>%
head(70)
# Node metadata for every ingredient that appears in those top pairs
ingredient_info <- ingredients_summarized %>%
filter(ingredient %in% top_cors$item1) %>%
distinct()
# ~ 35 mins
# Network of ingredients: edges connect correlated ingredients, node size
# shows how many recipes use the ingredient, color shows where in the recipe
# the ingredient typically appears.
top_cors %>%
graph_from_data_frame(vertices = ingredient_info) %>%
ggraph(layout = "fr") +
geom_edge_link() +
geom_node_text(aes(label = name), repel = TRUE) +
geom_node_point(aes(size = 1.1* n)) +
geom_node_point(aes(size = n, color = avg_position)) +
scale_color_gradient2(low = "red", high = "blue", midpoint = .5,
labels = scales::percent_format()) +
labs(size = "# of recipes",
color = "Avg position in drink",
title = "The universe of cocktail ingredients",
subtitle = "Connected ingredients tend to appear in the same drink. Red ingredients are early in the recipe, \n blue ingredients tend to be later")
I typically do my data cleaning with data in a tidy format, like boston_cocktails or cocktails_parsed. When it’s time for modeling, we usually need the data in a wider format, so let’s use pivot_wider() to reshape our data.
# Reshape to one row per cocktail with one numeric column per ingredient
# (0 when the recipe doesn't use that ingredient) -- the wide format that
# modeling functions expect.
cocktails_df <- cocktails_parsed_filtered %>%
  select(-c(ingredient_number, row_id, measure)) %>% # drop columns not needed for modeling
  pivot_wider(names_from = ingredient, values_from = measure_number, values_fill = 0) %>%
  janitor::clean_names() # e.g. "light rum" -> light_rum
# na.omit()
cocktails_df
## # A tibble: 937 x 42
## name category light_rum lemon_juice lime_juice sweet_vermouth orange_juice
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Gaug… Cocktai… 2 1 1 0 0
## 2 Fort… Cocktai… 1.5 0 0.25 0.5 0.25
## 3 Cuba… Cocktai… 2 0 0.5 0 0
## 4 Cool… Cocktai… 0 0 0 0 1
## 5 John… Whiskies 0 1 0 0 0
## 6 Cher… Cocktai… 1.25 0 0 0 0
## 7 Casa… Cocktai… 2 0 1.5 0 0
## 8 Cari… Cocktai… 0.5 0 0 0 0
## 9 Ambe… Cordial… 0 0.25 0 0 0
## 10 The … Whiskies 0 0.5 0 0 0
## # … with 927 more rows, and 35 more variables: powdered_sugar <dbl>,
## # dark_rum <dbl>, cranberry_juice <dbl>, pineapple_juice <dbl>,
## # bourbon_whiskey <dbl>, simple_syrup <dbl>, cherry_flavored_brandy <dbl>,
## # light_cream <dbl>, triple_sec <dbl>, maraschino <dbl>, amaretto <dbl>,
## # grenadine <dbl>, apple_brandy <dbl>, brandy <dbl>, gin <dbl>,
## # anisette <dbl>, dry_vermouth <dbl>, apricot_flavored_brandy <dbl>,
## # bitters <dbl>, straight_rye_whiskey <dbl>, benedictine <dbl>,
## # egg_white <dbl>, half_and_half <dbl>, vodka <dbl>, grapefruit_juice <dbl>,
## # blended_scotch_whiskey <dbl>, port <dbl>, white_creme_de_cacao <dbl>,
## # citrus_flavored_vodka <dbl>, whole_egg <dbl>, egg_yolk <dbl>,
## # blended_whiskey <dbl>, dubonnet <dbl>, blanco_tequila <dbl>,
## # old_mr_boston_dry_gin <dbl>
This dataset is especially delightful because we get to use recipes with recipes! Let’s load the tidymodels metapackage and implement principal component analysis with a recipe.
library(tidymodels)
# Define (but do not yet estimate) a PCA preprocessing recipe:
# name/category stay as identifiers, every other column is a predictor.
pca_rec <- recipe(~., data = cocktails_df) %>%
  update_role(name, category, new_role = "id") %>%
  step_normalize(all_predictors()) %>% # centers and scales all predictors
  step_pca(all_predictors())
pca_rec # PCA hasn't actually been run yet - see next step
## Data Recipe
##
## Inputs:
##
## role #variables
## id 2
## predictor 40
##
## Operations:
##
## Centering and scaling for all_predictors
## No PCA components were extracted.
Now we actually run PCA:
# prep() estimates the recipe steps (normalization stats + PCA loadings)
pca_prep <- pca_rec %>% prep()
# pca_prep
Let’s look at our results:
# Extract tidied loadings from step 2 of the recipe (the PCA step)
tidied_pca <- tidy(pca_prep, number = 2)
tidied_pca
## # A tibble: 1,600 x 4
## terms value component id
## <chr> <dbl> <chr> <chr>
## 1 light_rum 0.163 PC1 pca_VFQSw
## 2 lemon_juice -0.0140 PC1 pca_VFQSw
## 3 lime_juice 0.224 PC1 pca_VFQSw
## 4 sweet_vermouth -0.0661 PC1 pca_VFQSw
## 5 orange_juice 0.0308 PC1 pca_VFQSw
## 6 powdered_sugar -0.476 PC1 pca_VFQSw
## 7 dark_rum 0.124 PC1 pca_VFQSw
## 8 cranberry_juice 0.0954 PC1 pca_VFQSw
## 9 pineapple_juice 0.119 PC1 pca_VFQSw
## 10 bourbon_whiskey 0.0963 PC1 pca_VFQSw
## # … with 1,590 more rows
# Loadings of every ingredient on the first five principal components
tidied_pca %>%
  filter(component %in% paste0("PC", 1:5)) %>%
  mutate(component = fct_inorder(component)) %>% # keep PC1..PC5 facet order
  ggplot(aes(x = value, y = terms, fill = terms)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~component, nrow = 1) +
  labs(y = NULL)
library(tidytext)
# For each of the first four components, show the 8 ingredients with the
# largest absolute loadings, colored by the sign of the loading.
tidied_pca %>%
  filter(component %in% paste0("PC", 1:4)) %>%
  group_by(component) %>%
  top_n(8, abs(value)) %>%
  ungroup() %>%
  # reorder_within() (tidytext) orders terms inside each facet; it requires
  # scale_y_reordered() below to strip its suffix from the axis labels.
  mutate(terms = reorder_within(terms, by = abs(value), within = component)) %>%
  ggplot(aes(abs(value), terms, fill = value > 0)) +
  geom_col() +
  facet_wrap(~component, scales = "free_y") + # removed stray empty argument from the call
  scale_y_reordered() +
  labs(
    x = "Absolute value of contribution",
    y = NULL, fill = "Positive?"
  )
So PC1 is about powdered sugar + egg + gin drinks vs. simple syrup + lime + tequila drinks. This is the component that explains the most variation in drinks. PC2 is mostly about vermouth, both sweet and dry.
How are the cocktails distributed in the plane of the first two components?
# Plot each cocktail in PC1/PC2 space; juice() returns the processed data
juice(pca_prep) %>%
  ggplot(aes(x = PC1, y = PC2, label = name)) +
  geom_point(aes(color = category), alpha = 0.7, size = 2) +
  geom_text(check_overlap = TRUE, hjust = "inward") + # suppress overlapping labels
  labs(color = NULL)